Demo - 18 - Insurance Bill¶
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import plotly.express as px
import plotly.io as pio
pio.renderers.default='notebook'
from plotly.offline import init_notebook_mode
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
# import warnings
# warnings.filterwarnings('ignore')
In [59]:
data = pd.read_csv('insurance.csv')
data.head(5)
Out[59]:
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 |
In [55]:
data.describe(include='all')
Out[55]:
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| count | 1338.000000 | 1338 | 1338.000000 | 1338.000000 | 1338 | 1338 | 1338.000000 |
| unique | NaN | 2 | NaN | NaN | 2 | 4 | NaN |
| top | NaN | male | NaN | NaN | no | southeast | NaN |
| freq | NaN | 676 | NaN | NaN | 1064 | 364 | NaN |
| mean | 39.207025 | NaN | 30.663397 | 1.094918 | NaN | NaN | 13270.422265 |
| std | 14.049960 | NaN | 6.098187 | 1.205493 | NaN | NaN | 12110.011237 |
| min | 18.000000 | NaN | 15.960000 | 0.000000 | NaN | NaN | 1121.873900 |
| 25% | 27.000000 | NaN | 26.296250 | 0.000000 | NaN | NaN | 4740.287150 |
| 50% | 39.000000 | NaN | 30.400000 | 1.000000 | NaN | NaN | 9382.033000 |
| 75% | 51.000000 | NaN | 34.693750 | 2.000000 | NaN | NaN | 16639.912515 |
| max | 64.000000 | NaN | 53.130000 | 5.000000 | NaN | NaN | 63770.428010 |
In [56]:
data.isnull().sum()
Out[56]:
age 0 sex 0 bmi 0 children 0 smoker 0 region 0 charges 0 dtype: int64
In [57]:
data.isna().sum()
Out[57]:
age 0 sex 0 bmi 0 children 0 smoker 0 region 0 charges 0 dtype: int64
Label Encoding¶
In [60]:
#sex
le = LabelEncoder()
le.fit(data.sex.drop_duplicates())
data.sex = le.transform(data.sex)
le_sex_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Gender/Sex label mapping", le_sex_mapping)
# smoker or not
le.fit(data.smoker.drop_duplicates())
data.smoker = le.transform(data.smoker)
le_smoker_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Smoker label mapping", le_smoker_mapping)
#region
le.fit(data.region.drop_duplicates())
data.region = le.transform(data.region)
le_region_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Region label mapping", le_region_mapping)
Gender/Sex label mapping {'female': 0, 'male': 1}
Smoker label mapping {'no': 0, 'yes': 1}
Region label mapping {'northeast': 0, 'northwest': 1, 'southeast': 2, 'southwest': 3}
In [45]:
data.corrwith(data['charges']).sort_values(ascending=False)
Out[45]:
charges 1.000000 smoker 0.787251 age 0.299008 bmi 0.198341 children 0.067998 sex 0.057292 region -0.006208 dtype: float64
In [46]:
# Alternatively
data.corr()['charges'].sort_values(ascending=False)
Out[46]:
charges 1.000000 smoker 0.787251 age 0.299008 bmi 0.198341 children 0.067998 sex 0.057292 region -0.006208 Name: charges, dtype: float64
In [47]:
data
Out[47]:
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | 0 | 27.900 | 0 | 1 | 3 | 16884.92400 |
| 1 | 18 | 1 | 33.770 | 1 | 0 | 2 | 1725.55230 |
| 2 | 28 | 1 | 33.000 | 3 | 0 | 2 | 4449.46200 |
| 3 | 33 | 1 | 22.705 | 0 | 0 | 1 | 21984.47061 |
| 4 | 32 | 1 | 28.880 | 0 | 0 | 1 | 3866.85520 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1333 | 50 | 1 | 30.970 | 3 | 0 | 1 | 10600.54830 |
| 1334 | 18 | 0 | 31.920 | 0 | 0 | 0 | 2205.98080 |
| 1335 | 18 | 0 | 36.850 | 0 | 0 | 2 | 1629.83350 |
| 1336 | 21 | 0 | 25.800 | 0 | 0 | 3 | 2007.94500 |
| 1337 | 61 | 0 | 29.070 | 0 | 1 | 1 | 29141.36030 |
1338 rows × 7 columns
Heatmap¶
In [9]:
f, ax = pl.subplots(figsize=(10, 8))
corr = data.corr()
sns.heatmap(corr, square=True, ax=ax, annot=True, cmap='mako')
Out[9]:
<Axes: >
Distribution of charges¶
In [10]:
sns.displot(data, x="charges", height=8)
Out[10]:
<seaborn.axisgrid.FacetGrid at 0x16c5b7370>
In [11]:
sns.displot(data, x="charges", height=8, log_scale=True)
Out[11]:
<seaborn.axisgrid.FacetGrid at 0x16c48eb50>
Charge on smoker and non-smoker¶
In [12]:
f= pl.figure(figsize=(16,6))
ax=f.add_subplot(121)
sns.histplot(data[(data.smoker == 1)]["charges"],color='c',ax=ax)
ax.set_title('Distribution of charges for smokers')
ax=f.add_subplot(122)
sns.histplot(data[(data.smoker == 0)]['charges'],color='b',ax=ax)
ax.set_title('Distribution of charges for non-smokers')
Out[12]:
Text(0.5, 1.0, 'Distribution of charges for non-smokers')
In [13]:
# Log-scale
f= pl.figure(figsize=(16,6))
ax=f.add_subplot(121)
sns.histplot(data[(data.smoker == 1)]["charges"],
color='c', ax=ax, log_scale=True,)
ax.set_title('Distribution of charges for smokers')
ax=f.add_subplot(122)
sns.histplot(data[(data.smoker == 0)]['charges'],
color='b', ax=ax, log_scale=True,)
ax.set_title('Distribution of charges for non-smokers')
Out[13]:
Text(0.5, 1.0, 'Distribution of charges for non-smokers')
Patient gender count - Smoker and non-smoker¶
In [14]:
# 0:female 1:male
# 0:non-smoker 1:smoker
In [15]:
import matplotlib.pyplot as plt
f = sns.catplot(x="smoker", kind="count",hue = 'sex',
palette="pink", data=data, legend=False)
plt.legend(title='Smoker', loc='upper right',
labels=['Female', 'Male'])
plt.show(f)
In [16]:
le.inverse_transform(data['smoker'])
Out[16]:
array(['northwest', 'northeast', 'northeast', ..., 'northeast',
'northeast', 'northwest'], dtype=object)
Distribution on charge – smoker vs non smoker¶
In [17]:
# 0:female 1:male
# 0:non-smoker 1:smoker
sns.boxplot(x="sex", y="charges", hue="smoker",
palette=["g", "m"], data=data,)
Out[17]:
<Axes: xlabel='sex', ylabel='charges'>
Box plot for charges of men¶
In [62]:
# 0: non-smoker; 1: smoker
pl.figure(figsize=(12,5))
pl.title("Box plot for charges of men")
sns.boxplot(y="smoker", x="charges", data = data[(data.sex == 1)] ,
orient="h", palette = 'magma', hue="smoker")
Out[62]:
<Axes: title={'center': 'Box plot for charges of men'}, xlabel='charges', ylabel='smoker'>
Box plot for charges of women¶
In [61]:
pl.figure(figsize=(12,5))
pl.title("Box plot for charges of women")
sns.boxplot(y="smoker", x="charges", data = data[(data.sex == 0)] , orient="h",
palette = 'rainbow', hue='smoker' )
Out[61]:
<Axes: title={'center': 'Box plot for charges of women'}, xlabel='charges', ylabel='smoker'>
Distribution of age¶
In [20]:
pl.figure(figsize=(12,5))
pl.title("Distribution of age")
ax = sns.histplot(data["age"], color = 'g')
The number of smokers and non-smokers (18 years old)¶
In [21]:
sns.catplot(x="smoker", kind="count",hue = 'sex', palette="rainbow",
data=data[(data.age == 18)])
pl.title("The number of smokers and non-smokers (18 years old)")
Out[21]:
Text(0.5, 1.0, 'The number of smokers and non-smokers (18 years old)')
18 years old - a very young age. Does smoking affect the cost of treatment at this age?¶
In [22]:
pl.figure(figsize=(12,5))
pl.title("Box plot for charges 18 years old smokers")
sns.boxplot(y="smoker", x="charges", data = data[(data.age == 18)] , orient="h", hue='smoker', palette = 'pink')
Out[22]:
<Axes: title={'center': 'Box plot for charges 18 years old smokers'}, xlabel='charges', ylabel='smoker'>
Distribution of charges and age for non-smokers¶
In [23]:
g = sns.jointplot(
data=data[(data.smoker == 0)], x="age", y="charges",
kind="kde", color="g")
ax.set_title('Distribution of charges and age for non-smokers')
Out[23]:
Text(0.5, 1.0, 'Distribution of charges and age for non-smokers')
Distribution of charges and age for smokers¶
In [24]:
g = sns.jointplot(
data=data[(data.smoker == 1)], x="age", y="charges",
kind="kde", color="grey")
ax.set_title('Distribution of charges and age for smokers')
Out[24]:
Text(0.5, 1.0, 'Distribution of charges and age for smokers')
Non smoker charge distribution on age¶
In [25]:
fig = px.scatter(data[(data.smoker == 0)], x="age", y="charges",
log_x=True, size_max=60, title='Non smoker charge distribution on age')
fig.show()
Smoker charge distribution on age¶
In [26]:
fig = px.scatter(data[(data.smoker == 1)], x="age", y="charges",
log_x=True, size_max=60,
title='Smoker charge distribution on age')
fig.show()
Smokers and non-smokers on age¶
In [27]:
sns.lmplot(x="age", y="charges", hue="smoker", data=data, height=6 )
ax.set_title('Smokers and non-smokers')
Out[27]:
Text(0.5, 1.0, 'Smokers and non-smokers')
Distribution of bmi¶
In [28]:
pl.figure(figsize=(12,5))
pl.title("Distribution of bmi")
ax = sns.histplot(data["bmi"], color = 'm')
Distribution of charges for patients with BMI greater than 30¶
In [29]:
pl.figure(figsize=(12,5))
pl.title("Distribution of charges for patients with BMI greater than 30")
ax = sns.histplot(data[(data.bmi >= 30)]['charges'], color = 'r')
Distribution of charges for patients with BMI less than 30¶
In [30]:
pl.figure(figsize=(12,5))
pl.title("Distribution of charges for patients with BMI less than 30")
ax = sns.histplot(data[(data.bmi < 30)]['charges'], color = 'b')
Scatter plot of charges and bmi¶
In [31]:
pl.figure(figsize=(10,6))
ax = sns.scatterplot(x='bmi',y='charges',data=data,palette='magma',hue='smoker')
ax.set_title('Scatter plot of charges and bmi')
sns.lmplot(x="bmi", y="charges", hue="smoker", data=data, palette = 'magma')
Out[31]:
<seaborn.axisgrid.FacetGrid at 0x303b61730>
Child patient count with how many sibling¶
In [32]:
sns.catplot(x="children", kind="count", palette="ch:.25", data=data, hue='children')
Out[32]:
<seaborn.axisgrid.FacetGrid at 0x303a9c910>
Smokers and non-smokers who have childrens¶
In [33]:
sns.catplot(x="smoker", kind="count", palette="rainbow",hue = "sex",
data=data[(data.children > 0)])
ax.set_title('Smokers and non-smokers who have childrens')
Out[33]:
Text(0.5, 1.0, 'Smokers and non-smokers who have childrens')
In [ ]: